### Please also see README for required R packages
packages <- c(
  "tidyverse",
  "firatheme",
  "ggplot2",
  "Hmisc",
  "broom",
  "MASS",
  "effects",
  "cjoint",
  "scales",
  "stringr",
  "forcats",
  "patchwork",
  "gridExtra",
  "cowplot",
  "dplyr",
  "stargazer",
  "quanteda",
  "quanteda.textstats",
  "quanteda.textplots",
  "tidytext",
  "haven"
)

lapply(packages, library, character.only = TRUE)


### load data
load("data/SI/tabSI3.RData")

### Table SI 3 
tabSI3 <- tabSI3 |>
  mutate(
    treatment = recode(treatment, "simple" = "Low", "sophisticated" = "High"),
    Treatment = paste(topic, position, treatment, sep = "/"),
    topic = factor(topic, levels = c("climate change", "budget", "refugees")),
    position = factor(position, levels = c("against", "neutral", "in favor")),
    treatment = factor(treatment, levels = c("Low", "High"))
  )

# Create the summary table with the exact ordering
tabSI3 <- tabSI3 |>
  arrange(
    match(treatment, c(
      "climate change/against/Low", "budget/against/Low", "refugees/against/Low",
      "climate change/against/High", "budget/against/High", "refugees/against/High",
      "climate change/neutral/Low", "budget/neutral/Low", "refugees/neutral/Low",
      "climate change/neutral/High", "budget/neutral/High", "refugees/neutral/High",
      "climate change/in favor/Low", "budget/in favor/Low", "refugees/in favor/Low",
      "climate change/in favor/High", "budget/in favor/High", "refugees/in favor/High"
    ))
  ) |>
  dplyr::select(
    Treatment, 
    `No. of words` = W, 
    `Characters per word (avg.)` = meanWordChars, 
    `Characters per sentence (avg.)` = meanSentenceChars, 
    `No. of characters` = C, 
    ` (%) top 1000 words from Leipzig Corpus` = per_leipzig
  )

tabSI3

### Table SI 4 

### load data
load("data/SI/tabSI4.RData")

# Create the summary table with the exact ordering
tabSI4 <- tabSI4 |>
  arrange(
    match(treatment, c(
      "climate change/against/Low", "budget/against/Low", "refugees/against/Low",
      "climate change/against/High", "budget/against/High", "refugees/against/High",
      "climate change/neutral/Low", "budget/neutral/Low", "refugees/neutral/Low",
      "climate change/neutral/High", "budget/neutral/High", "refugees/neutral/High",
      "climate change/in favor/Low", "budget/in favor/Low", "refugees/in favor/Low",
      "climate change/in favor/High", "budget/in favor/High", "refugees/in favor/High"
    ))
  )

tabSI4

### Figure SI 1 

# load data
load("data/SI/figSI1.RData")

# Ensure lr_1 is numeric
pooled$lr_1 <- as.numeric(pooled$lr_1)

# Function to fit GLM and extract z-scores
extract_z_scores <- function(var) {
  model <- glm(as.formula(paste("treatment ~", var)), data = pooled, family = binomial)
  tibble(
    var = rownames(coef(summary(model)))[-1],  # remove intercept
    z_score = coef(summary(model))[-1, "z value"]
  )
}

# Define variables to include (excluding treatment)
vars <- setdiff(names(pooled), "treatment")

# Run GLMs for all variables and combine results
z <- map_dfr(vars, extract_z_scores)

# Clean variable names
z <- z |> 
  filter(!str_detect(var, "Intercept")) |>  # Remove all intercept terms
  filter(!var %in% c("voteEine andere Partei, und zwar:", "voteIch weiß es nicht", "voteIch würde nicht wählen gehen",
                     "votekeine Angabe", "edu_schoolAnderer Schulabschluss, und zwar:", "classkeiner dieser Schichten",
                     "genderkeine Angabe", "edukeine Angabe", "pol_interestkeine Angabe", "classIch weiß es nicht",
                     "classkeine Angabe", "edu_schoolkeine Angabe")) |> 
  mutate(var = if_else(var == "lr_1", "left-right", var))

# Plot results
ggplot(z, aes(x = z_score, y = var)) + 
  annotate("rect", xmin = -3, xmax = 3, ymin = "voteBündnis 90/Die Grünen", ymax = "voteSPD (Sozialdemokratische Partei Deutschlands)",
           alpha = .2, fill = "blue") +
  annotate("rect", xmin = -3, xmax = 3, ymin = "pol_interestweniger stark", ymax = "pol_interestmittelmäßig",
           alpha = .2, fill = "blue") +
  annotate("rect", xmin = -3, xmax = 3, ymin = "genderweiblich", ymax = "gendermännlich",
           alpha = .2, fill = "blue") +
  annotate("rect", xmin = -3, xmax = 3, ymin = "edu_schoolVolks-/Hauptschulabschluss", ymax = "edu_schoolFachhochschulreife",
           alpha = .2, fill = "blue") +
  annotate("rect", xmin = -3, xmax = 3, ymin = "classder Mittelschicht", ymax = "classder Unterschicht",
           alpha = .2, fill = "blue") +
  geom_point(size = 1, color = "gray60") + 
  theme_fira() + 
  scale_x_continuous(limits = c(-3, 3), breaks = c(-1.96, 1.96)) + 
  geom_vline(xintercept = c(-1.96, 1.96))

# Table SI 8 

# load data
load("data/fig3/rawtext_pooled.RData")

# run models
model1 <- lm(cosine ~ treatment + fe, data = rawtext_pooled)
model2 <- lm(cosine ~ treatment + issue + fe, data = rawtext_pooled)
model3 <- lm(cosine ~ treatment + issue + position + fe, data = rawtext_pooled)
model4 <- lm(cosine ~ treatment*party_match + issue + position + fe, data = rawtext_pooled)

# extract HC1
model1 <- coeftest(model1, vcov=vcovHC(model1, type = ("HC1"), model1$ResponseId))
model2 <- coeftest(model2, vcov=vcovHC(model2, type = ("HC1"), model2$ResponseId))
model3 <- coeftest(model3, vcov=vcovHC(model3, type = ("HC1"), model3$ResponseId))
model4 <- coeftest(model4, vcov=vcovHC(model4, type = ("HC1"), model4$ResponseId))

# make table
stargazer(model1, model2, model3, model4)

# Table SI 9 

# load data
load("data/fig3/preprocessed_pooled.RData")

# run models
model1 <- lm(cosine ~ treatment + fe, data = preprocessed_pooled)
model2 <- lm(cosine ~ treatment + issue + fe, data = preprocessed_pooled)
model3 <- lm(cosine ~ treatment + issue + position + fe, data = preprocessed_pooled)
model4 <- lm(cosine ~ treatment*party_match + issue + position + fe, data = preprocessed_pooled)

# extract HC1
model1 <- coeftest(model1, vcov=vcovHC(model1, type = ("HC1"), model1$ResponseId))
model2 <- coeftest(model2, vcov=vcovHC(model2, type = ("HC1"), model2$ResponseId))
model3 <- coeftest(model3, vcov=vcovHC(model3, type = ("HC1"), model3$ResponseId))
model4 <- coeftest(model4, vcov=vcovHC(model4, type = ("HC1"), model4$ResponseId))

# make table
stargazer(model1, model2, model3, model4)

# Table SI 10 

# load data
load("data/fig3/embeddings_pooled.RData")

# run models
model1 <- lm(cosine_scores ~ treatment + fe, data = embeddings_pooled)
model2 <- lm(cosine_scores ~ treatment + issue + fe, data = embeddings_pooled)
model3 <- lm(cosine_scores ~ treatment + issue + position + fe, data = embeddings_pooled)
model4 <- lm(cosine_scores ~ treatment*party_match + issue + position + fe, data = embeddings_pooled)

# extract HC1
model1 <- coeftest(model1, vcov=vcovHC(model1, type = ("HC1"), model1$ResponseId))
model2 <- coeftest(model2, vcov=vcovHC(model2, type = ("HC1"), model2$ResponseId))
model3 <- coeftest(model3, vcov=vcovHC(model3, type = ("HC1"), model3$ResponseId))
model4 <- coeftest(model4, vcov=vcovHC(model4, type = ("HC1"), model4$ResponseId))

# make table
stargazer(model1, model2, model3, model4)

# Table SI 11 

# load data
load("data/fig3/distance_pooled.RData")

# run models
model1 <- lm(outcome2_distance ~ treatment + fe, data = distance_pooled)
model2 <- lm(outcome2_distance ~ treatment + issue + fe, data = distance_pooled)
model3 <- lm(outcome2_distance ~ treatment + issue + position + fe, data = distance_pooled)
model4 <- lm(outcome2_distance ~ treatment*party_match + issue + position + fe, data = distance_pooled)

# extract HC1
model1 <- coeftest(model1, vcov=vcovHC(model1, type = ("HC1"), model1$ResponseId))
model2 <- coeftest(model2, vcov=vcovHC(model2, type = ("HC1"), model2$ResponseId))
model3 <- coeftest(model3, vcov=vcovHC(model3, type = ("HC1"), model3$ResponseId))
model4 <- coeftest(model4, vcov=vcovHC(model4, type = ("HC1"), model4$ResponseId))

# make table
stargazer(model1, model2, model3, model4)

# Table SI 11 

# load data
load("data/fig3/distance_pooled.RData")

# run models
model1 <- lm(outcome2_distance ~ treatment + fe, data = distance_pooled)
model2 <- lm(outcome2_distance ~ treatment + issue + fe, data = distance_pooled)
model3 <- lm(outcome2_distance ~ treatment + issue + position + fe, data = distance_pooled)
model4 <- lm(outcome2_distance ~ treatment*party_match + issue + position + fe, data = distance_pooled)

# extract HC1
model1 <- coeftest(model1, vcov=vcovHC(model1, type = ("HC1"), model1$ResponseId))
model2 <- coeftest(model2, vcov=vcovHC(model2, type = ("HC1"), model2$ResponseId))
model3 <- coeftest(model3, vcov=vcovHC(model3, type = ("HC1"), model3$ResponseId))
model4 <- coeftest(model4, vcov=vcovHC(model4, type = ("HC1"), model4$ResponseId))

# make table
stargazer(model1, model2, model3, model4)

# Table SI 12 

# load data
load("data/fig4/class_pooled.RData")

# run models
model1 <- polr(class_order ~ treatment + fe, data = class_pooled)
model2 <- polr(class_order ~ treatment + issue + fe, data = class_pooled)
model3 <- polr(class_order ~ treatment + issue + position + fe, data = class_pooled)
model4 <- polr(class_order ~ treatment*party_match + issue + position + fe, data = class_pooled)

# make table
stargazer(model1, model2, model3, model4)


# Table SI 13 

# load data
load("data/fig4/education_pooled.RData")

# run models
model1 <- polr(edu_order ~ treatment + fe, data = education_pooled)
model2 <- polr(edu_order ~ treatment + issue + fe, data = education_pooled)
model3 <- polr(edu_order ~ treatment + issue + position + fe, data = education_pooled)
model4 <- polr(edu_order ~ treatment*party_match + issue + position + fe, data = education_pooled)

# make table
stargazer(model1, model2, model3, model4)

# Table SI 14

# load data
load("data/fig5/correct_answer.RData")
load("data/fig5/gender.RData")
load("data/fig5/party.RData")

model1 <- glm(profile_gender_correct ~ treatment + issue + position + fe, data = data_correct_answer)
model2 <- glm(profile_party_correct ~ treatment + issue + position + fe, data = data_correct_answer)
model3 <- polr(profil_party_0_GROUP ~ treatment + issue + position + fe, data = data_party)
model4 <- polr(profil_gender_0_GROUP ~ treatment + issue + position + fe, data = data_gender)

# make table
stargazer(model1, model2, model3, model4)

# Figure SI 2

# Load data
load("data/SI/figSI2_figSI3.RData")

model1 <- glm(class_binary ~ treatment + issue + position, data = class_w1_binary)
model2 <- glm(class_binary ~ treatment + issue + position, data = class_w2_binary)
model3 <- glm(class_binary ~ treatment + issue + position, data = class_w3_binary)
model4 <- glm(class_binary ~ treatment + issue + position + fe, data = class_pooled_binary)

# Function to extract predicted probabilities for "treatment" variable
extract_glm_effects <- function(model, id_label, facet_group) {
  effect_obj <- Effect(focal.predictors = "treatment", mod = model, xlevels = list(treatment = c("simple", "sophisticated")))
  effect_data <- as.data.frame(effect_obj)
  
  effect_data |> 
    mutate(
      treatment = factor(treatment, levels = c("simple", "sophisticated")),
      id = id_label,
      facet = facet_group
    ) |> 
    rename(prob = fit, lower = lower, upper = upper) |>  
    dplyr::select(treatment, prob, lower, upper, id, facet)
}

# Extract effects for all models
glm_data <- bind_rows(
  extract_glm_effects(model4, "pooled", "pooled estimates"),  
  extract_glm_effects(model1, "wave1", "wave estimates"),
  extract_glm_effects(model2, "wave2", "wave estimates"),
  extract_glm_effects(model3, "wave3", "wave estimates")
)

# Define colors manually
color_mapping <- c("simple" = "gray50", "sophisticated" = "black")

# Create the faceted plot
ggplot(glm_data, aes(x = id, y = prob, color = treatment)) +
  geom_point(position = position_dodge(width = 0.4), size = 2) +
  geom_linerange(aes(ymin = lower, ymax = upper), position = position_dodge(width = 0.4), alpha = 0.8) +
  scale_y_continuous(limits = c(0.3, 0.6), expand = c(0, 0)) +
  scale_color_manual(values = color_mapping) +
  labs(y = "probability", x = NULL) +
  facet_wrap(~facet, scales = "free_x", nrow = 1) +  # Faceting by "pooled estimates" vs. "wave estimates"
  theme_fira() +
  theme(
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 10, 10, 10),
    strip.text = element_text(size = 14, color = "darkorange")  # Highlight facet labels
  )


### Figure SI 3

# Load data
load("data/SI/figSI2_figSI3.RData")

model1 <- glm(edu_binary ~ treatment + issue + position, data = education_w1_binary)
model2 <- glm(edu_binary ~ treatment + issue + position, data = education_w2_binary)
model3 <- glm(edu_binary ~ treatment + issue + position, data = education_w3_binary)
model4 <- glm(edu_binary ~ treatment + issue + position + fe, data = education_pooled_binary)

# Function to extract predicted probabilities
extract_glm_effects <- function(model, id_label, facet_group) {
  effect_obj <- Effect(focal.predictors = "treatment", mod = model)
  effect_data <- as.data.frame(effect_obj)
  
  effect_data |> 
    mutate(
      treatment = factor(treatment, levels = c("simple", "sophisticated")),
      id = id_label,
      facet = facet_group
    ) |> 
    rename(prob = fit, lower = lower, upper = upper) |>  
    dplyr::select(treatment, prob, lower, upper, id, facet)
}

# Extract effects for all education models
glm_data <- bind_rows(
  extract_glm_effects(model4, "pooled", "pooled estimates"),
  extract_glm_effects(model1, "wave1", "wave estimates"),
  extract_glm_effects(model2, "wave2", "wave estimates"),
  extract_glm_effects(model3, "wave3", "wave estimates")
)

# Define colors manually
color_mapping <- c("simple" = "gray50", "sophisticated" = "black")

ggplot(glm_data, aes(x = id, y = prob, color = treatment)) +
  geom_point(position = position_dodge(width = 0.4), size = 2) +
  geom_linerange(aes(ymin = lower, ymax = upper), position = position_dodge(width = 0.4), alpha = 0.8) +
  scale_y_continuous(limits = c(0.3, 0.6), expand = c(0, 0)) +  # Adjust limits based on probabilities
  scale_color_manual(values = color_mapping) +
  labs(y = "probability", x = NULL) +
  facet_wrap(~facet, scales = "free_x", nrow = 1) +  # Faceting for pooled and wave estimates
  theme_fira() +
  theme(
    legend.position = "right",
    legend.title = element_blank(),
    legend.text = element_text(size = 12),
    axis.title.y = element_text(size = 16),
    axis.text.x = element_text(size = 14),
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 10, 10, 10),
    strip.text = element_text(size = 14, color = "darkorange")  # Highlight facet labels
  )


# Table SI 15 

# Load data
load("data/SI/tabSI15.RData")

model1 <- lm(outcome2_distance ~ treatment + interest + issue + position + fe, data = tabSI15)
model2 <- lm(outcome2_distance ~ treatment * interest + issue + position + fe, data = tabSI15)
model3 <- lm(outcome2_distance ~ treatment + knowledge + issue + position + fe, data = tabSI15)
model4 <- lm(outcome2_distance ~ treatment * knowledge + issue + position + fe, data = tabSI15)

stargazer(model1, model2, model3, model4)


# Table SI 16 

# Load data
load("data/SI/tabSI16.RData")

model_afd1 <- polr(class_order ~ treatment + AfD + issue + position + fe, data = pooled_4)
model_afd2 <- polr(class_order ~ treatment * AfD + issue + position + fe, data = pooled_4)
model_afd3 <- polr(class_order ~ treatment + AfD2 + issue + position + fe, data = pooled_4)
model_afd4 <- polr(class_order ~ treatment * AfD2 + issue + position + fe, data = pooled_4)


model_afd5 <- polr(edu_order ~ treatment + AfD + issue + position + fe, data = pooled_4)
model_afd6 <- polr(edu_order ~ treatment * AfD + issue + position + fe, data = pooled_4)
model_afd7 <- polr(edu_order ~ treatment + AfD2 + issue + position + fe, data = pooled_4)
model_afd8 <- polr(edu_order ~ treatment * AfD2 + issue + position + fe, data = pooled_4)

stargazer(model_afd1, model_afd2, model_afd3, model_afd4) 
stargazer(model_afd5, model_afd6, model_afd7, model_afd8)

# Table SI 17 

# Load data
load("data/SI/tabSI17.RData")

model_class1 <- polr(class_order ~ treatment + respondent_class + issue + position + fe, data = pooled_4)
model_class2 <- polr(class_order ~ treatment * respondent_class + issue + position + fe, data = pooled_4)

model_edu1 <- polr(edu_order ~ treatment + respondent_edu + issue + position + fe, data = pooled_4)
model_edu2 <- polr(edu_order ~ treatment * respondent_edu + issue + position + fe, data = pooled_4)

stargazer(model_class1, model_class2, model_edu1, model_edu2)

# Table SI 18 

# Load data 
load("data/SI/tabSI18.RData")

model1 <- lm(cosine ~ treatment + fe, data = pooled)
model2 <- lm(cosine ~ treatment + issue + fe, data = pooled)
model3 <- lm(cosine ~ treatment + issue + position + fe, data = pooled)
model4 <- lm(cosine ~ treatment * party_match + issue + position + fe, data = pooled)

stargazer(model1, model2, model3, model4)

# Table SI 19 

# Load data
load("data/SI/tabSI19.RData")

model1 <- lm(outcome2_distance ~ treatment + fe, data = pooled)
model2 <- lm(outcome2_distance ~ treatment + issue + fe, data = pooled)
model3 <- lm(outcome2_distance ~ treatment + issue + position + fe, data = pooled)
model4 <- lm(outcome2_distance ~ treatment * party_match + issue + position + fe, data = pooled)

stargazer(model1, model2, model3, model4)

# Table SI 20

# Load data
load("data/SI/tabSI20_SI21.RData")

model1 <- polr(class_order ~ treatment  + fe, data = pooled_4)
model2 <- polr(class_order ~ treatment + issue + fe, data = pooled_4)
model3 <- polr(class_order ~ treatment + issue + position + fe, data = pooled_4)
model4 <- polr(class_order ~ treatment * party_match + issue + position + fe, data = pooled_4)

stargazer(model1, model2,model3, model4)

# Table SI 21

# Load data
load("data/SI/tabSI20_SI21.RData")

model1 <- polr(edu_order ~ treatment + fe, data = pooled_4)
model2 <- polr(edu_order ~ treatment + issue + fe, data = pooled_4)
model3 <- polr(edu_order ~ treatment + issue + position + fe, data = pooled_4)
model4 <- polr(edu_order ~ treatment * party_match + issue + position + fe, data = pooled_4)

stargazer(model1, model2,model3, model4)


# Figure SI 5 - Left panel

# Load data
load("data/SI/figSI5.RData")

highcosine <- subset(pooled, cosine > 0.30)
highcosine <- corpus(highcosine$outcome1)

toks_ger <- tokens(highcosine, remove_punct = TRUE, remove_numbers = TRUE) %>% 
  tokens_remove(pattern = stopwords("de", source = "marimo")) %>% 
  tokens_keep(pattern = "^[\\p{script=Latn}]+$", valuetype = "regex")

highcosine <- subset(pooled, cosine > 0.30)
dfm <- dfm(toks_ger)
dfm@docvars$treatment <- highcosine$treatment
keyness <- textstat_keyness(dfm,target = dfm@docvars$treatment == "1")
textplot_keyness(keyness, margin = 0.2, n = 20, color = c("lightblue", "gray60"), show_legend = T, show_reference = F) + xlim(0,200) + ggtitle("Higher cosine: >0.30")

# Figure SI 5 - Right panel 

# Load data
load("data/SI/figSI5.RData")

lowcosine <- subset(pooled, cosine < 0.15)
lowcosine <- corpus(lowcosine$outcome1)

toks_ger <- tokens(lowcosine, remove_punct = TRUE, remove_numbers = TRUE) %>% 
  tokens_remove(pattern = stopwords("de", source = "marimo")) %>% 
  tokens_keep(pattern = "^[\\p{script=Latn}]+$", valuetype = "regex")

lowcosine <- subset(pooled, cosine < 0.15)
dfm <- dfm(toks_ger)
dfm@docvars$treatment <- lowcosine$treatment
keyness <- textstat_keyness(dfm,target = dfm@docvars$treatment == "1")
textplot_keyness(keyness, margin = 0.2, n = 20, color = c("lightblue", "gray60"), show_legend = T, show_reference = F) + xlim(0,200) + ggtitle("Low cosine: <0.15")


# Figure SI 6 and Figure SI 7

load("data/SI/figSI6.RData")
load("data/SI/figSI7.RData")

extract_amce <- function(model, dataset_label) {
  s <- summary(model)
  
  get_clean_df <- function(df, label) {
    df |> 
      as_tibble() |> 
      dplyr::select(Attribute, Level, Estimate, `Std. Err`, `z value`) |> 
      mutate(
        group = label,
        upper = Estimate + 1.96 * `Std. Err`,
        lower = Estimate - 1.96 * `Std. Err`,
        dataset = dataset_label
      )
  }
  
  afd <- get_clean_df(s$afd1amce, "AfD")
  other <- get_clean_df(s$afd2amce, "other party")  
  
  baselines <- s$baselines_amce |> 
    as_tibble() |> 
    bind_rows(s$baselines_amce) |> 
    mutate(
      Estimate = 0,
      `Std. Err` = 0.0001,
      `z value` = NA_real_,
      upper = Estimate + 1.96 * `Std. Err`,
      lower = Estimate - 1.96 * `Std. Err`,
      group = rep(c("AfD", "other party"), each = nrow(s$baselines_amce)),
      dataset = dataset_label
    ) |> 
    dplyr::select(Attribute, Level, Estimate, `Std. Err`, `z value`, upper, lower, group, dataset)
  
  all_data <- bind_rows(afd, other, baselines)
  
  # Recode level labels first
  all_data$Level <- recode(all_data$Level,
                           "to serve the party" = "serve party",
                           "to impact personally" = "impact personally",
                           "to represent ordinary people" = "represent people"
  )
  
  # Add plotorder, order, and baseline
  all_data <- all_data |>
    mutate(
      plotorder = factor(
        Attribute,
        levels = c("mp_age", "mp_party", "mp_sex", "mp_position", "mp_sophistication",
                   "mp_motivation", "mp_days", "mp_experience", "mp_occupation"),
        labels = c("age", "party", "sex", "position", "sophistication",
                   "motivation", "days", "experience", "occupation")
      ),
      order = case_when(
        Level %in% c("26", "AfD", "against", "male", "simple", "serve party", "1 day", "new", "farmer") ~ 1,
        Level %in% c("30", "Die Linke", "neutral", "female", "sophisticated", "impact personally", "2 days", "incumbent", "nurse") ~ 2,
        Level %in% c("47", "B90/Grüne", "for", "represent people", "3 days", "nurturer") ~ 3,
        Level %in% c("57", "FDP", "lawyer") ~ 4,
        Level %in% c("64", "CDU/CSU", "political scientist") ~ 5,
        Level %in% c("79", "SPD", "teacher") ~ 6,
        TRUE ~ NA_real_
      ),
      baseline = if_else(
        Level %in% c("AfD", "against", "male", "simple", "26", "new", "1 day", "farmer", "serve party"),
        "1", "0"
      )
    )
  
  return(all_data)
}

amce2 <- cjoint::amce(selected ~  afd:`mp_sex` +  afd:`mp_age` + afd:`mp_party` + afd:`mp_position` + afd:`mp_sophistication`, cjointdata2, 
                      design = MPdesign2, subset = NULL, respondent.varying = "afd", respondent.id = "respondent", cluster = FALSE, na.ignore= TRUE, weights = NULL)

amce1 <- cjoint::amce(selected ~  afd:`mp_sex` +  afd:`mp_age` + afd:`mp_party` + afd:`mp_motivation` + afd:`mp_days` +  afd:`mp_occupation` + afd:`mp_experience` + afd:`mp_position` + afd:`mp_sophistication`, 
                      cjointdata1, design = MPdesign, subset = NULL, respondent.varying = "afd", respondent.id = "respondent",  cluster = FALSE,  na.ignore= TRUE, weights = NULL)

plotdata_amce2 <- extract_amce(amce2, "Low Information")
plotdata_amce1 <- extract_amce(amce1, "High Information")

combined_amce <- bind_rows(plotdata_amce1, plotdata_amce2)

plot_amce <- function(data, title_text) {
  ggplot(data, aes(x = reorder_within(Level, -order, plotorder), y = Estimate, color = group)) +
    geom_hline(yintercept = 0, linetype = 2, color = "gray70") +
    geom_pointrange(aes(ymin = lower, ymax = upper),
                    size = 0.7, fatten = 0.5,
                    position = position_dodge(width = 0.8)) +
    coord_flip() +
    scale_x_discrete(labels = function(x) gsub("__.+", "", x)) +
    facet_wrap(~plotorder, scales = "free_y", ncol = 3) +
    scale_colour_manual(values = c("AfD" = "gray30", "other party" = "gray80")) +
    theme_fira() +
    labs(title = title_text, x = NULL, y = NULL) +
    theme(
      legend.position = "bottom",
      legend.title = element_blank(),
      axis.text.y = element_text(size = 14),
      axis.text.x = element_text(size = 14),
      strip.text = element_text(size = 14)
    )
}


plot_amce(filter(combined_amce, dataset == "Low Information"), "Low-Information Environment")
plot_amce(filter(combined_amce, dataset == "High Information"), "High-Information Environment")


# Figure SI 8 and Figure SI 9

load("data/SI/figSI8.RData")
load("data/SI/figSI9.RData")

extract_amce <- function(model, dataset_label) {
  s <- summary(model)
  
  extract_group <- function(name, label) {
    df <- as_tibble(s[[name]])
    if ("attribute" %in% names(df)) {
      df <- df |> rename(
        Attribute = attribute,
        Level = level,
        Estimate = estimate,
        `Std. Err` = `std.err`
      )
    }
    df |> 
      dplyr::select(Attribute, Level, Estimate, `Std. Err`, `z value`) |> 
      mutate(
        group = label,
        upper = Estimate + 1.96 * `Std. Err`,
        lower = Estimate - 1.96 * `Std. Err`,
        dataset = dataset_label
      )
  }
  
  # Extract all three class levels
  lower <- extract_group("class1amce", "lower")
  middle <- extract_group("class3amce", "middle")
  upper <- extract_group("class4amce", "upper")
  
  # Baselines for all
  baselines <- s$baselines_amce |> 
    as_tibble()
  if ("attribute" %in% names(baselines)) {
    baselines <- baselines |> rename(Attribute = attribute, Level = level)
  }
  
  baseline_all <- bind_rows(
    baselines, baselines, baselines
  ) |> 
    mutate(
      Estimate = 0,
      `Std. Err` = 0.0001,
      `z value` = NA_real_,
      upper = Estimate + 1.96 * `Std. Err`,
      lower = Estimate - 1.96 * `Std. Err`,
      group = rep(c("lower", "middle", "upper"), each = nrow(baselines)),
      dataset = dataset_label
    )
  
  all_data <- bind_rows(lower, middle, upper, baseline_all)
  
  all_data$Level <- recode(all_data$Level,
                           "to serve the party" = "serve party",
                           "to impact personally" = "impact personally",
                           "to represent ordinary people" = "represent people"
  )
  
  all_data <- all_data |> 
    mutate(
      plotorder = factor(
        Attribute,
        levels = c("mp_age", "mp_party", "mp_sex", "mp_position", "mp_sophistication",
                   "mp_motivation", "mp_days", "mp_experience", "mp_occupation"),
        labels = c("age", "party", "sex", "position", "sophistication",
                   "motivation", "days", "experience", "occupation")
      ),
      order = case_when(
        Level %in% c("26", "AfD", "against", "male", "simple", "serve party", "1 day", "new", "farmer") ~ 1,
        Level %in% c("30", "Die Linke", "neutral", "female", "sophisticated", "impact personally", "2 days", "incumbent", "nurse") ~ 2,
        Level %in% c("47", "B90/Grüne", "for", "represent people", "3 days", "nurturer") ~ 3,
        Level %in% c("57", "FDP", "lawyer") ~ 4,
        Level %in% c("64", "CDU/CSU", "political scientist") ~ 5,
        Level %in% c("79", "SPD", "teacher") ~ 6,
        TRUE ~ NA_real_
      ),
      baseline = if_else(
        Level %in% c("AfD", "against", "male", "simple", "26", "new", "1 day", "farmer", "serve party"),
        "1", "0"
      )
    )
  
  return(all_data)
}


amce2 <- cjoint::amce(selected ~  class:`mp_sex` +  class:`mp_age` + class:`mp_party` + class:`mp_position` + class:`mp_sophistication`, cjointdata2, design = MPdesign2, subset = NULL,
                      respondent.varying = "class", respondent.id = "respondent", cluster = FALSE, na.ignore= TRUE, weights = NULL)

amce1 <- cjoint::amce(selected ~  class:`mp_sex` +  class:`mp_age` + class:`mp_party` + class:`mp_motivation` + class:`mp_days` + class:`mp_occupation` + class:`mp_experience` + class:`mp_position` + class:`mp_sophistication`, 
                      cjointdata1, design = MPdesign, subset = NULL, respondent.varying = "class", respondent.id = "respondent", cluster = FALSE, na.ignore= TRUE, weights = NULL)


plotdata_amce2 <- extract_amce(amce2, "Low Information")
plotdata_amce1 <- extract_amce(amce1, "High Information")

plot_amce <- function(data, title_text) {
  ggplot(data, aes(x = reorder_within(Level, -order, plotorder), y = Estimate, color = group)) +
    geom_hline(yintercept = 0, linetype = 2, color = "gray70") +
    geom_pointrange(aes(ymin = lower, ymax = upper),
                    size = 0.7, fatten = 0.5,
                    position = position_dodge(width = 0.8)) +
    coord_flip() +
    scale_y_continuous(limits = c(-0.75, 0.75), breaks = seq(-0.5, 0.5, by = 0.5)) +
    scale_x_discrete(labels = ~gsub("___.*", "", .)) +
    facet_wrap(~plotorder, scales = "free_y", ncol = 3) +
    scale_colour_manual(
      values = c("lower" = "grey70", "middle" = "grey50", "upper" = "grey30"),
      name = "Class"
    ) +
    theme_fira() +
    labs(title = title_text, x = NULL, y = NULL) +
    theme(
      legend.position = "bottom",
      legend.title = element_blank(),
      axis.text.y = element_text(size = 14),
      axis.text.x = element_text(size = 14),
      strip.text = element_text(size = 14)
    )
}

plot_amce(plotdata_amce2, "Low-Information Environment")
plot_amce(plotdata_amce1, "High-Information Environment")

# Figure SI 10 and Figure SI 11

load("data/SI/figSI10.RData")
load("data/SI/figSI11.RData")

extract_amce <- function(model, dataset_label) {
  s <- summary(model)
  
  get_clean_df <- function(df, label) {
    df |> 
      as_tibble() |> 
      dplyr::select(Attribute, Level, Estimate, `Std. Err`, `z value`) |> 
      mutate(
        test = label,
        upper = Estimate + 1.96 * `Std. Err`,
        lower = Estimate - 1.96 * `Std. Err`,
        dataset = dataset_label
      )
  }
  
  failed <- get_clean_df(s$manipultest1amce, "failed")
  passed <- get_clean_df(s$manipultest3amce, "passed")
  
  # Baselines repeated for both test groups
  baselines <- s$baselines_amce |> 
    as_tibble() |> 
    bind_rows(s$baselines_amce) |> 
    mutate(
      Estimate = 0,
      `Std. Err` = 0.0001,
      `z value` = NA_real_,
      upper = Estimate + 1.96 * `Std. Err`,
      lower = Estimate - 1.96 * `Std. Err`,
      test = rep(c("failed", "passed"), each = nrow(s$baselines_amce)),
      dataset = dataset_label
    ) |> 
    dplyr::select(Attribute, Level, Estimate, `Std. Err`, `z value`, upper, lower, test, dataset)
  
  bind_rows(failed, passed, baselines)
}

# Low Information AMCE
amce2 <- cjoint::amce(selected ~  manipultest:`mp_sex` +  manipultest:`mp_age` + manipultest:`mp_party` + manipultest:`mp_position` + manipultest:`mp_sophistication`, 
                      cjointdata2, design = MPdesign2, subset = NULL, respondent.varying = "manipultest", respondent.id = "respondent", cluster = FALSE, na.ignore= TRUE, weights = NULL)

# High Information AMCE
amce1 <- cjoint::amce(selected ~  manipultest:`mp_sex` +  manipultest:`mp_age` + manipultest:`mp_party` + manipultest:`mp_motivation` + manipultest:`mp_days` +  
                        manipultest:`mp_occupation` + manipultest:`mp_experience` + manipultest:`mp_position` + manipultest:`mp_sophistication`, cjointdata1, design = MPdesign,
                      subset = NULL, respondent.varying = "manipultest", respondent.id = "respondent", cluster = FALSE, na.ignore= TRUE, weights = NULL)

plotdata_amce2 <- extract_amce(amce2, "Low Information")
plotdata_amce1 <- extract_amce(amce1, "High Information")


# Combine All Data
combined_amce <- bind_rows(plotdata_amce1, plotdata_amce2) |>
  mutate(
    test = factor(test, levels = c("failed", "passed")),
    dataset = factor(dataset, levels = c("Low Information", "High Information")),
    plotorder = factor(Attribute, levels = c(
      "mp_age", "mp_party", "mp_sex", "mp_position", "mp_sophistication",
      "mp_motivation", "mp_days", "mp_experience", "mp_occupation"
    ), labels = c(
      "age", "party", "sex", "position", "sophistication",
      "motivation", "days", "experience", "occupation"
    )),
    order = case_when(
      Level %in% c("26", "AfD", "against", "male", "simple", "to serve the party", "1 day", "new", "farmer") ~ 1,
      Level %in% c("30", "Die Linke", "neutral", "female", "sophisticated", "to impact personally", "2 days", "incumbent", "nurse") ~ 2,
      Level %in% c("47", "B90/Grüne", "for", "to represent ordinary people", "3 days", "nurturer") ~ 3,
      Level %in% c("57", "FDP", "lawyer") ~ 4,
      Level %in% c("64", "CDU/CSU", "political scientist") ~ 5,
      Level %in% c("79", "SPD", "teacher") ~ 6,
      TRUE ~ NA_real_
    ),
    Level = dplyr::recode(Level,
                          "to serve the party" = "serve party",
                          "to impact personally" = "impact personally",
                          "to represent ordinary people" = "represent people")
  )

plot_amce <- function(data, title_text) {
  ggplot(data, aes(x = reorder(Level, -order), y = Estimate, color = test)) +
    geom_hline(yintercept = 0, linetype = 2, color = "gray70") +
    geom_pointrange(aes(ymin = lower, ymax = upper),
                    size = 0.7, fatten = 0.5, position = position_dodge(width = 0.8)) +
    coord_flip() +
    scale_y_continuous(limits = c(-0.15, 0.3), breaks = c(-0.1, 0, 0.1, 0.2)) +
    facet_wrap(plotorder ~ ., scales = "free_y", ncol = 3) +
    scale_colour_manual(values = c("failed" = "gray30", "passed" = "gray80")) +
    theme_fira() +
    labs(title = title_text, x = "", y = "") +
    theme(
      legend.title = element_blank(),
      legend.position = "bottom",
      axis.text.y = element_text(size = 14, color = "black"),
      axis.text.x = element_text(size = 14, color = "black"),
      strip.text = element_text(size = 14, color = "black")
    )
}


# Generate Plots - Figure SI 10 and SI 11  
plot_amce(filter(combined_amce, dataset == "Low Information"), "Low-Information Environment")
plot_amce(filter(combined_amce, dataset == "High Information"), "High-Information Environment")

# Figure SI 12

# Load data
load("data/SI/figSI12.RData")

extract_amce_partychoice <- function(model, dataset_label) {
  s <- summary(model)
  
  get_clean_df <- function(df, label) {
    df_clean <- df |> 
      as_tibble() |> 
      filter(Attribute == "mp_position") |> 
      dplyr::select(Attribute, Level, Estimate, `Std. Err`, `z value`) |> 
      mutate(
        group = label,
        upper = Estimate + 1.96 * `Std. Err`,
        lower = Estimate - 1.96 * `Std. Err`,
        dataset = dataset_label
      )
    
    # Add baseline row for 'against'
    baseline_row <- tibble(
      Attribute = "mp_position",
      Level = "against",
      Estimate = 0,
      `Std. Err` = 0.0001,
      `z value` = NA_real_,
      upper = 0 + 1.96 * 0.0001,
      lower = 0 - 1.96 * 0.0001,
      group = label,
      dataset = dataset_label
    )
    
    bind_rows(df_clean, baseline_row)
  }
  
  spd    <- get_clean_df(s$partychoice11amce, "SPD")
  fdp    <- get_clean_df(s$partychoice8amce,  "FDP")
  linke  <- get_clean_df(s$partychoice10amce, "Linke")
  cdu    <- get_clean_df(s$partychoice7amce,  "CDU/CSU")
  gruene <- get_clean_df(s$partychoice9amce,  "Grüne")
  afd    <- get_clean_df(s$partychoice6amce,  "AfD")
  
  bind_rows(spd, fdp, linke, cdu, gruene, afd) |> 
    mutate(Level = as.character(Level)) |>  # reset any previous factor structure
    mutate(Level = factor(Level, levels = c("against", "neutral", "for"))
    )
}


amce2 <- cjoint::amce(selected ~  partychoice:mp_sex +  partychoice:mp_age + partychoice:mp_party + partychoice:mp_position + partychoice:mp_sophistication, cjointdata2, 
                      design = MPdesign2, subset = NULL, respondent.varying = "partychoice", respondent.id = "respondent", 
                      cluster = FALSE, na.ignore= TRUE, weights = NULL)

amce1 <- cjoint::amce(selected ~  partychoice:mp_sex +  partychoice:mp_age + partychoice:mp_party + partychoice:mp_motivation + partychoice:mp_days +  
                        partychoice:mp_occupation + partychoice:mp_experience + partychoice:mp_position + partychoice:mp_sophistication, 
                      cjointdata1, design = MPdesign, subset = NULL, respondent.varying = "partychoice", respondent.id = "respondent", 
                      cluster = FALSE, na.ignore= TRUE, weights = NULL)

# Extract data only for 'mp_position'
plotdata_amce1 <- extract_amce_partychoice(amce1, "High Information")
plotdata_amce2 <- extract_amce_partychoice(amce2, "Low Information")

# Plot function 
plot_amce_partychoice <- function(data, title_text) {
  ggplot(data, aes(x = Level, y = Estimate, color = group)) +
    geom_hline(yintercept = 0, linetype = 2, color = "gray70") +
    geom_pointrange(aes(ymin = lower, ymax = upper),
                    size = 0.7, fatten = 0.5,
                    position = position_dodge(width = 0.8)) +
    coord_flip() +
    scale_y_continuous(limits = c(-0.5, 0.5), breaks = seq(-0.5, 0.5, by = 0.25)) +
    scale_colour_manual(
      values = c("AfD" = "blue", "CDU/CSU" = "black", "FDP" = "yellow3",
                 "Grüne" = "darkgreen", "Linke" = "darkred", "SPD" = "red"),
      name = "Party"
    ) +
    theme_fira() +
    labs(title = title_text, x = NULL, y = "AMCE") +
    theme(
      legend.position = "bottom",
      legend.title = element_blank(),
      axis.text = element_text(size = 14),
      strip.text = element_text(size = 14)
    )
}

# Plot separately
plot_amce_partychoice(plotdata_amce2, "Low-Information Environment")
plot_amce_partychoice(plotdata_amce1, "High-Information Environment")

# Figure SI 13

# Load data 
load("data/SI/figSI13_figSI16.RData")

boxplot(W~country, data=german_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Number of words", xlab="")

boxplot(C~country, data=german_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Number of characters", xlab="")

boxplot(meanWordChars~country, data=german_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper word", xlab="",
        ylim=c(5,10))

boxplot(meanSentenceChars~country, data=german_df, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper sentence", xlab="")

boxplot(meanSentenceLength~country, data=german_df,ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Average number of \nwords per sentence", xlab="")

boxplot(per_leipzig~country, data=german_df, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Percentage Leipzig \nCorpus Top 1000", xlab="", ylim=c(40,80))


# Figure SI 14

# Load data 
load("data/SI/figSI13_figSI16.RData")

boxplot(W~party_name, data=german_ger_df, notch=F, ylab = "",
        main="Number of words", xlab="")

boxplot(C~party_name, data=german_ger_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Number of characters", xlab="")

boxplot(meanWordChars~party_name, data=german_ger_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper word", xlab="",
        ylim=c(6,7.5))

boxplot(meanSentenceChars~party_name, data=german_ger_df, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper sentence", xlab="",
        ylim=c(50,200))

boxplot(meanSentenceLength~party_name, data=german_ger_df,ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Average number of \nwords per sentence", xlab="",
        ylim=c(10,25))

boxplot(per_leipzig~party_name, data=german_ger_df, ylab = "",
        main="Percentage Leipzig \nCorpus Top 1000", xlab="", ylim=c(45,70))


# Figure SI 15

# Load data 
load("data/SI/figSI13_figSI16.RData")

boxplot(W~party_name, data=german_aut_df, notch=F, ylab = "",
        main="Number of words", xlab="")

boxplot(C~party_name, data=german_aut_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Number of characters", xlab="")

boxplot(meanWordChars~party_name, data=german_aut_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper word", xlab="",
        ylim=c(6,8))

boxplot(meanSentenceChars~party_name, data=german_aut_df, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper sentence", xlab="",
        ylim=c(50,200))

boxplot(meanSentenceLength~party_name, data=german_aut_df,ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Average number of \nwords per sentence", xlab="",
        ylim=c(10,30))

boxplot(per_leipzig~party_name, data=german_aut_df, ylab = "",
        main="Percentage Leipzig \nCorpus Top 1000", xlab="", ylim=c(45,70))


# Figure SI 16

# Load data 
load("data/SI/figSI13_figSI16.RData")

boxplot(W~party_name, data=german_sui_df, notch=F, ylab = "",
        main="Number of words", xlab="")

boxplot(C~party_name, data=german_sui_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Number of characters", xlab="")

boxplot(meanWordChars~party_name, data=german_sui_df, notch=F, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper word", xlab="",
        ylim=c(5,8))

boxplot(meanSentenceChars~party_name, data=german_sui_df, ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Avergage characters \nper sentence", xlab="",
        ylim=c(50,200))

boxplot(meanSentenceLength~party_name, data=german_sui_df,ylab = "",
        col=(c("gray90","gray90", "gray90")),
        main="Average number of \nwords per sentence", xlab="",
        ylim=c(10,30))

boxplot(per_leipzig~party_name, data=german_sui_df, ylab = "",
        main="Percentage Leipzig \nCorpus Top 1000", xlab="", ylim=c(45,70))


# Figure SI 17

# Load data 
load("data/SI/figSI17.RData")

# Function to create density plot comparing Germany with another country
create_density_plot <- function(data, country) {
  ggplot(data = data[data$country %in% c("Germany", country), ],
         aes(x = complexity, fill = country)) +
    geom_density(alpha = 0.3) +
    scale_fill_manual(values = c("red", "blue")) + 
    labs(title = paste(""),
         x = "Complexity",
         y = "Density",
         fill = "") + 
    theme_fira() + theme(legend.position = "bottom")
}

# List of countries to compare with Germany
comparison_countries <- c("Netherlands", "Spain", "Sweden", "UK")

# Create side-by-side density plots for each comparison
plots_list <- lapply(comparison_countries, create_density_plot, data = speeches)
plots_combined <- do.call(gridExtra::grid.arrange, c(plots_list, ncol = 2))

# Figure SI 18

# Load data 
load("data/SI/figSI18.RData")

show_density_panels <- function(var_name, label, data = df, comparison_countries = c("Netherlands", "Spain", "UK")) {
  
  # Set x-axis limits for selected variables
  x_limits <- list(
    sentences = c(-250, 750),
    terms = c(0, 3000)
  )
  
  create_density_plot <- function(data, compare_to, var_name, label) {
    filtered_data <- data %>%
      filter(country %in% c("Germany", compare_to)) %>%
      mutate(country = factor(country, levels = c("Germany", compare_to)))  # enforce color order
    
    color_map <- setNames(c("red", "blue"), c("Germany", compare_to))
    
    p <- ggplot(filtered_data, aes_string(x = var_name, fill = "country")) +
      geom_density(alpha = 0.3) +
      scale_fill_manual(values = color_map, drop = FALSE) +
      labs(
        title = NULL,
        x = label,
        y = "Density",
        fill = ""
      ) +
      theme_fira() +
      theme(legend.position = "bottom")
    
    # Apply x-axis limits if defined
    if (var_name %in% names(x_limits)) {
      p <- p + xlim(x_limits[[var_name]])
    }
    
    return(p)
  }
  
  plots_list <- lapply(comparison_countries, function(cntry) {
    create_density_plot(data, cntry, var_name, label)
  })
  
  gridExtra::grid.arrange(grobs = plots_list, ncol = 3)
}


show_density_panels("flesh", "Flesch")
show_density_panels("entropy", "Entropy")
show_density_panels("sentences", "Number of sentences")
show_density_panels("sentlength", "Words per sentence")
show_density_panels("terms", "Number of words")


# Table SI 22

# Load data 
load("data/SI/tabSI22.RData")

# Define the comparison countries in desired order
comparison_countries <- c("Germany", "Austria", "Denmark", "France", "Italy", "Spain", "Sweden", "Switzerland", "UK")

# Recode gender and summarize
summary_table <- df1 |>
  filter(country %in% comparison_countries) |>
  mutate(
    gender = case_when(
      gender == 2 ~ "Male",
      gender == 1 ~ "Female",
      TRUE ~ NA_character_
    )
  ) |>
  group_by(country) |>
  summarise(
    N = n(),
    mean_age = mean(age, na.rm = TRUE),
    sd_age = sd(age, na.rm = TRUE),
    mean_edulevel = mean(edulevel, na.rm = TRUE),
    sd_edulevel = sd(edulevel, na.rm = TRUE),
    percent_male = mean(gender == "Male", na.rm = TRUE) * 100,
    percent_female = mean(gender == "Female", na.rm = TRUE) * 100
  ) |>
  mutate(country = factor(country, levels = comparison_countries)) |>
  arrange(country)

# Show table
summary_table


# Figure SI 19

# Load data 
load("data/SI/figSI19.RData")

knowledge_data <- knowledge_data |>
  mutate(knowledge = if_else(knowledge > 1, 0, knowledge))

knowledge_sum <- knowledge_data |>
  group_by(id, year) |>
  summarise(total_knowledge = sum(knowledge)) |>
  ungroup()

# Step 4: Calculate percentages
knowledge_percentages <- knowledge_sum |>
  group_by(year) |>
  count(total_knowledge) |>
  mutate(percentage = n / sum(n) * 100)

ggplot(knowledge_percentages, aes(x = total_knowledge, y = percentage)) +
  geom_bar(stat = "identity") +  # Use the actual y-values
  facet_wrap(~ year) +
  theme_light() +
  labs(title = "",
       x = "No. of known candidates",
       y = "Percentage") +
  scale_x_continuous(breaks = unique(knowledge_percentages$total_knowledge)) + ylim(0,75)







